---
title: "Topic model"
author: "Katrina Keegan"
date: "10/6/2021"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(topicmodels)
library(tidytext)
library(ggraph)
library(extrafont)
```

```{r, warning = FALSE}
# Creating the document term matrix

# Change the file name (file must be in the folder where this code is)

# Read the tokenised corpus. Only `token` and `index` are used below;
# `index` is presumably the article identifier (it becomes the document id of
# the document-term matrix) -- TODO confirm against the tokenising script.
tokenized_texts <- read_csv(
  "TOKENS_ukraina_ru_vaccine.csv",
  col_types = cols(
    X1 = col_double(),
    token = col_character(),
    index = col_double()
  )
)

# Tokens that carry no topical meaning and are removed before modelling.
# The empty string is listed so that blank tokens are discarded too.
ignored_tokens <- c(
  "",
  "должный",
  "сказать",
  "мочь",
  "однако",
  "именно",
  "становиться",
  "поэтому",
  "слово",
  "хотя",
  "пока"
)

# Number of times each token occurs in each document.
# Missing tokens are dropped explicitly: the previous chain of `!=`
# comparisons removed them implicitly, whereas `%in%` alone would keep them.
counted_tokenized_texts <- tokenized_texts %>%
  select(token, index) %>%
  filter(!is.na(token), !token %in% ignored_tokens) %>%
  count(index, token, name = "count")

# Document-term matrix: one row per `index`, one column per token.
dtm <- counted_tokenized_texts %>%
  cast_dtm(index, token, count)



```


```{r}
# Pick an LDA model
# First change the k variable over a few options, and run the following code chunk to see which one makes the most sense.
# Then change the seed to a few options to see if randomness makes one that is more sensible than the others
# Save all the previous ones as it takes a long time to run on big data and you don't want to have to rerun if possible.

#This is the LDA function from the topicmodels package to actually perform the topic modelling. The control = list(seed) is supposed to make sure that the randomized output is the same each time.
#k is the number of topics.

# Fit one LDA model on `dtm` with `k` topics and a fixed random seed.
fit_lda <- function(k, seed) {
  LDA(dtm, k = k, control = list(seed = seed))
}

# --- seed 48 ---------------------------------------------------------------
lda_2 <- fit_lda(k = 2, seed = 48)

# Foreign relations, medical vaccination campaign, unclear (political?)
lda_3 <- fit_lda(k = 3, seed = 48)

# This one pretty good, emphasizes procurement
# Procurement, vaccination elsewhere, Ukraine international, medical
lda_4 <- fit_lda(k = 4, seed = 48)

# Vaccine procurement, vaccine campaign, international, medical,
# Russia-Ukraineish
lda_5 <- fit_lda(k = 5, seed = 48)

# --- seed 138 --------------------------------------------------------------
# GOING WITH THIS ONE
# Ukraine-China, Ukraine-Russia-US, vaccination campaign, politics
lda_4a <- fit_lda(k = 4, seed = 138)

# Ukraine-China, Ukraine-Russia, vaccination campaign, politics, ?
lda_5a <- fit_lda(k = 5, seed = 138)

# --- other seeds, k = 4 ----------------------------------------------------
# Russia-Ukraine-humanitarian, international politics, domestic politics,
# vaccination campaign
lda_4b <- fit_lda(k = 4, seed = 764)

# Vaccination campaign, domestic politics, pandemic, international politics
lda_4c <- fit_lda(k = 4, seed = 233)

# NOTE(review): this object is named without the underscore used by the
# other models (lda_4a, lda_4b, ...); the name is kept as it was.
lda4d <- fit_lda(k = 4, seed = 238)

```


```{r}
# View the words in each topic

# tidy() from tidytext returns the topic model results as a tibble; the
# columns used below are `topic`, `term` and `beta`.
tidy_lda <- tidy(lda_4a)

# Words that are excluded from the keyword ranking.
uninformative_terms <- c(
  "говорить",
  "время",
  "просто",
  "ранее",
  "являться",
  "самый",
  "называть",
  "отмечать"
)

# Add or remove topics here as needed, and change the name of the topic to
# its actual name. Position i is the label for topic number i.
topic_names <- c(
  "Ukraine-China-Russia",
  "Ukraine-Russia-US",
  "Vaccination, pandemic",
  "Domestic politics"
)

# The 30 highest-beta words of each topic (the object keeps its historical
# name `top_20` because later chunks refer to it).
# `with_ties = FALSE` guarantees exactly 30 rows per topic; with the default,
# a tie at rank 30 would return extra rows and the translations, which are
# attached by position in a later chunk, would no longer line up.
# The result stays grouped by `topic`, as before.
top_20 <- tidy_lda %>%
  filter(!term %in% uninformative_terms) %>%
  group_by(topic) %>%
  slice_max(beta, n = 30, with_ties = FALSE) %>%
  # Topics without an entry in `topic_names` keep their number as the label.
  mutate(topic = coalesce(topic_names[topic], as.character(topic)))
  
```


```{r}
# Add translations
#
# The English labels are attached to `top_20` purely by position, so their
# order must match the row order of `top_20`: 30 words per topic, topics in
# numeric order, words from highest to lowest beta within each topic.
english_translation <- tibble(english = c(
  # Topic 1
  "Ukraine", "China", "country", "Kyiv", "Chinese",
  "Ukrainian", "Russia", "person", "vaccine", "Russian",
  "UN", "PRC", "announce", "Zelensky", "Beijing",
  "issue", "state", "hospital", "receive", "Crimea",
  "Kazakhstan", "child", "relations", "against", "doctor",
  "situation", "Chinese person", "place", "want", "offer",
  # Topic 2
  "Ukraine", "Russia", "country", "Zelensky", "person",
  "US", "president", "authorities/power", "issue", "Kyiv",
  "situation", "Ukrainian", "our",
  # NOTE(review): "would/peace" may be a typo for "world/peace" -- confirm
  # against the Russian term before changing it.
  "would/peace", "new",
  "political", "side", "Donbass", "consider", "Biden",
  "relations", "Russian", "think", "vaccine", "war",
  "problem", "announce", "state", "very", "occur",
  # Topic 3
  "vaccine", "Ukraine", "country", "vaccination", "person",
  "announce", "coronavirus", "medication", "report", "receive",
  "COVID-19", "Russia", "Sputnik V", "inoculation", "world",
  "Russian vaccine", "EU", "population", "vaccinate", "authorities",
  "first", "inoculate", "day", "Ukrainian person", "data",
  "case", "coronavirus vaccine", "dose", "US", "Ministry of Health",
  # Topic 4
  "Ukraine", "Zelensky", "vaccine", "country", "person",
  "Ukrainian", "authorities", "announce", "president", "issue",
  "first", "receive", "vaccination", "against", "Ukrainian person",
  "Russia", "party", "new", "matter/affair", "(Rada) deputy",
  "political", "company", "decision", "Europe", "journalist",
  "Medvedchuk", "Poroshenko", "consider", "situation", "want"
))

# Fail with a clear message if the keyword table and the translation list
# have drifted apart (e.g. after changing the model or the number of words).
stopifnot(
  "top_20 and english_translation must have the same number of rows" =
    nrow(top_20) == nrow(english_translation)
)

top_20 <- bind_cols(top_20, english_translation)
```


```{r}
# Make the graph showing key words

# Fill colours for the four topic panels.
bar_fill_colours <- c("#586BA4", "#F5DD90", "#B0413E", "#49A078")

# Approach taken from
# https://www.tidytextmining.com/nasa.html#interpreting-the-topic-model
# reorder_within() orders the words by beta inside each topic, so every panel
# runs from its own highest-beta word to its lowest (rather than following an
# overall beta ranking).
keyword_data <- top_20 %>%
  mutate(english = reorder_within(english, beta, topic)) %>%
  ungroup() %>%
  arrange(desc(beta))

top_30_plot <- ggplot(
  keyword_data,
  aes(x = beta, y = english, fill = topic)
) +
  geom_col(show.legend = FALSE) +
  # Pairs with reorder_within() above to restore readable axis labels.
  scale_y_reordered() +
  # scales = "free": each topic panel gets its own axes (its own set of words).
  facet_wrap(~topic, scales = "free", ncol = 4) +
  scale_x_continuous(n.breaks = 4, labels = scales::percent) +
  scale_fill_manual(values = bar_fill_colours) +
  labs(
    x = "Percent of the topic comprised of the word",
    y = "",
    title = ""
  ) +
  theme_bw() +
  theme(axis.title = element_text(size = 8))

ggsave(
  "uk_top_30_plot.png",
  top_30_plot,
  width = 10,
  height = 5,
  units = "in"
)

```


```{r}
# Finding what the main topic is for each article

# Position i is the label for topic number i.
gamma_topic_names <- c(
  "Ukraine-China-Russia",
  "Ukraine-Russia-US",
  "Vaccination, pandemic",
  "Domestic politics"
)

# One row per (document, topic) pair with its gamma value; topic numbers are
# replaced by their labels (numbers without a label are kept as text).
gamma <- tidy(lda_4a, matrix = "gamma") %>%
  mutate(topic = coalesce(gamma_topic_names[topic], as.character(topic)))

# For every document, keep the row(s) whose gamma equals the document's
# maximum gamma. Computed once and reused by everything below; the row order
# of `gamma` is preserved so that the seeded samples stay reproducible.
dominant_topic <- gamma %>%
  group_by(document) %>%
  mutate(max_gamma = max(gamma)) %>%
  filter(gamma == max_gamma) %>%
  ungroup()

# Number and share of articles per main topic, plus a formatted label.
main_topic <- dominant_topic %>%
  count(topic, name = "number_articles") %>%
  mutate(
    percent = round(number_articles / sum(number_articles), 3),
    labels = scales::percent(percent)
  )

# Getting random articles to read
set.seed(48)

# Up to 10 articles per topic that are strongly dominated by that topic.
random_articles <- dominant_topic %>%
  filter(max_gamma > 0.9) %>%
  group_by(topic) %>%
  slice_sample(n = 10)

# Up to 2 articles per topic where no single topic clearly dominates.
mix_random <- dominant_topic %>%
  filter(max_gamma < 0.6) %>%
  group_by(topic) %>%
  slice_sample(n = 2)

# Finding average max gamma
uk_gamma <- mean(dominant_topic$max_gamma)
```


```{r}
#How to make a pie chart here: https://r-charts.com/part-whole/pie-chart-percentages-ggplot2/


# Pie chart of the share of articles per main topic.
# The former `group_by("topic")` step was removed: it grouped by the literal
# string "topic" (adding a constant column) and did not order anything.
# No explicit ordering is applied here; the fill colours below are given in
# the same sequence as in the keyword bar chart.
uk_pie_plot <- main_topic %>%
  ggplot(aes(x = "", y = percent, fill = topic)) +
  geom_col() +
  # Percentage label centred inside each slice.
  geom_text(
    aes(label = labels),
    position = position_stack(vjust = 0.5),
    size = 8
  ) +
  # Polar coordinates over y turn the single stacked bar into a pie.
  coord_polar(theta = "y") +
  theme_void() +
  labs(title = "", fill = "") +
  theme(legend.position = "right", legend.text = element_text(size = 20)) +
  scale_fill_manual(values = c("#586BA4", "#F5DD90", "#B0413E", "#49A078"))

ggsave(
  filename = "uk_pie_plot.png",
  plot = uk_pie_plot,
  width = 9,
  height = 6,
  units = "in"
)

```


